import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, average_precision_score
from sklearn.metrics import classification_report
%matplotlib inline
warnings.filterwarnings(action='ignore')
1.A. Import ‘signal-data.csv’ as DataFrame
# 1.A — load the raw sensor-signal dataset and preview the first rows.
df = pd.read_csv('signal-data.csv')
df.head()
1.B. Print 5 point summary and share at least 2 observations
df.describe().T
Since the 'Time' data is not useful in the context of the problem, it is dropped.
df = df.drop('Time',axis=1)
2.A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.
# 2.A — inspect missingness, recode the target, drop sparse features,
# and mean-impute what remains.
df.isnull().sum()
df.shape
print('Number of rows : ',df.shape[0])
print('Number of columns: ',df.shape[1])

# Remap the target labels {1, -1} -> {0, 1}.
# NOTE(review): this assumes the SECOM convention (-1 = pass, 1 = fail),
# so afterwards 1 = pass and 0 = fail — confirm against the data source.
df['Pass/Fail'].unique()
df['Pass/Fail'] = df['Pass/Fail'].replace(to_replace=1,value=0)
df['Pass/Fail'] = df['Pass/Fail'].replace(to_replace=-1,value=1)

# Percentage of missing values per column, on a 0-100 scale.
null_pct = df.isnull().sum() * 100 / len(df)
sparse_cols = []
for col in null_pct.keys():
    # FIX: null_pct is a 0-100 percentage, so the "20%+" cut-off is 20,
    # not 0.20 (0.20 would drop almost every column with any nulls at all).
    if null_pct[col] >= 20:
        print(col, null_pct[col])
        sparse_cols.append(col)
df.drop(sparse_cols, axis=1, inplace=True)

# Impute the remaining missing values with each feature's mean.
for column in df.columns:
    df[column] = df[column].fillna(df[column].mean())
df.sample(5)
df.isnull().sum()
2.B. Identify and drop the features which are having same value for all the rows.
def remove_duplicates(df):
    """Drop constant columns and return the reduced DataFrame.

    A column whose standard deviation is 0 holds the same value in every
    row, so it carries no information for classification.
    NOTE(review): std() is NaN (not 0) for all-NaN columns, so those are
    not removed here — confirm that is acceptable upstream.
    """
    df_std = df.std()
    constant_features = df_std[df_std == 0].index
    # FIX: the original message read "same row values t:" — a stray 't'
    # left over from a mangled tab escape.
    print('Number of features removed with same row values:',
          len(constant_features))
    return df.drop(labels=constant_features, axis=1)
dup = remove_duplicates(df)
2.C. Drop other features if required using relevant functional knowledge. Clearly justify the same
# Report the dataset shape after the constant-signal removal step.
# NOTE(review): this count only reflects the drop if remove_duplicates'
# result was assigned back to df — verify the preceding cell.
n_rows, n_cols = df.shape
print('After dropping the constant signals the dataset contains', n_rows, 'rows and', n_cols, 'columns')
2.D. Check for multi-collinearity in the data and take necessary action.
# 2.D — visualise absolute pairwise correlations to spot multi-collinearity.
df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr().abs(), vmin=0, vmax=1)
plt.show()
2.E. Make all relevant modifications on the data using both functional/logical reasoning/assumptions
3.A. Perform a detailed univariate Analysis with appropriate detailed comments after each analysis
# 3.A — distribution of the target class (counts per label).
sns.histplot(data=df, x='Pass/Fail');
df['Pass/Fail'].value_counts()
3.B. Perform bivariate and multivariate analysis with appropriate detailed comments after each analysis.
# 3.B — heatmap of absolute feature correlations.
# FIX: the original called sns.heatmap(abs(df), ...) — a heatmap of the
# raw data matrix clipped to [0, 1], which is meaningless here; the
# correlation matrix (as in the 2.D plot) was clearly intended.
plt.figure(figsize = (10,6))
sns.heatmap(abs(df.corr()), vmin = 0, vmax = 1)
plt.show()
# Pie chart of the class balance.
# NOTE(review): the PASS/FAIL label order follows value_counts() ordering
# (majority class first) — confirm it matches the recoded labels.
counts = df['Pass/Fail'].value_counts()
fig = px.pie(
    counts,
    values='Pass/Fail',
    names=["PASS", "FAIL"],
    title="Class Distribution",
    width=500,
)
fig.show()
4.A. Segregate predictors vs target attributes
# 4.A — separate predictors from the target.
# Columns get an 'f' prefix so every feature name is a plain string
# (presumably for XGBoost's column-name requirements — verify).
y = df['Pass/Fail']
X = df.drop(labels='Pass/Fail', axis=1).add_prefix('f')
4.B. Check for target balancing and fix it if found imbalanced
# 4.B — the classes are imbalanced; oversample the minority with SMOTE.
y.value_counts()
X, y = SMOTE().fit_resample(X, y)
y.value_counts()
4.C. Perform train-test split and standardise the data or vice versa if required
# 4.C — hold-out split, then standardise.
# The scaler is fitted on the training portion only, to avoid leaking
# test statistics into the transform.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
4.D. Check if the train and test data have similar statistical characteristics when compared with original data.
# 4.D — summary statistics of each split, to compare against the full
# dataset (each bare expression echoes its table in the notebook).
X_train.describe()
X_test.describe()
y_train.describe()
y_test.describe()
5.A. Use any Supervised Learning technique to train a model.
# 5.A — baseline logistic-regression model on the standard split.
logit = LogisticRegression()
logit.fit(X_train, y_train)
logit_pred = logit.predict(X_test)
# Train vs test accuracy gives a first read on over/under-fitting.
print('Accuracy on Training data:', logit.score(X_train, y_train))
print('Accuracy on Test data:', logit.score(X_test, y_test))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

# k-nearest-neighbours baseline with k = 3.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Report test-set accuracy.
print(accuracy_score(y_test, y_pred))
5.B. Use cross validation techniques
# 5.B — 50-fold cross-validation of logistic regression on the full data.
num_folds = 50
# FIX: removed the unused `seed` variable; KFold without shuffle ignores
# random_state anyway.
kfold = KFold(n_splits=num_folds, random_state=None)
model = LogisticRegression()
results = cross_val_score(model, X, y, cv=kfold)
print(results)
# Mean and standard deviation of the per-fold accuracies, as percentages.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# Leave-one-out CV demo of logistic regression on a synthetic dataset.
from numpy import mean
from numpy import std
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

# FIX: the original assigned the synthetic blobs to X, y, silently
# clobbering the project dataset that every later cell relies on; keep
# the demo data under separate names.
X_demo, y_demo = make_blobs(n_samples=100, random_state=1)
cv = LeaveOneOut()
model = LogisticRegression()
scores = cross_val_score(model, X_demo, y_demo, scoring='accuracy', cv=cv, n_jobs=-1)
# FIX: report the LOOCV `scores` — the original printed the stale
# `results` array from the previous k-fold cell.
print("Accuracy: %.3f%% (%.3f%%)" % (scores.mean()*100.0, scores.std()*100.0))
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation for the two tree-based baselines.
dt = DecisionTreeClassifier()
score1 = cross_val_score(dt, X, y, cv = 10).mean()
print(f'Cross validation score of Decision tree = {score1}')

rf = RandomForestClassifier()
score2 = cross_val_score(rf, X, y, cv = 10).mean()
print(f'Cross validation score of Random forest = {score2}')
5.C.Apply hyper-parameter tuning techniques to get the best accuracy
# 5.C — logistic regression scored on the current hold-out split.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
score1 = lr.score(X_test, y_test)
score1
# Logistic regression on a stratified split (preserves the class ratio
# in train and test).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify = y)
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
score2 = lr.score(X_test, y_test)
# FIX: len(X_train) is the number of ROWS; the feature count is shape[1].
print(f'Number of features used = {X_train.shape[1]}')
print(f'Accuracy = {score2}')
from sklearn.tree import DecisionTreeClassifier

# Decision tree on a fresh (unstratified) split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
score3 = dt.score(X_test, y_test)
pred = dt.predict(X_test)
# FIX: "acccuracy" -> "accuracy" (typo in the printed message).
print(f"Decision tree accuracy score: {score3}")
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters on the same split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
score4 = rf.score(X_test, y_test)
print(f'Random Forest accuracy score = {score4}')
from sklearn.model_selection import cross_val_score

# 10-fold CV for the already-configured decision tree and random forest.
score5 = cross_val_score(dt, X, y, cv = 10).mean()
print(f'Cross validation score of Decision tree = {score5}')

score6 = cross_val_score(rf, X, y, cv = 10).mean()
print(f'Cross validation score of Random forest = {score6}')
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest.
# FIX: max_features='auto' was an alias for 'sqrt' on classifiers (so the
# original pair searched the same setting twice) and was removed in
# scikit-learn 1.3; search 'sqrt' and 'log2' instead.
parameters = {'bootstrap': [True],
              'max_depth': [10, 20, 30, 40, 50],
              'max_features': ['sqrt', 'log2'],
              'min_samples_leaf': [1, 2, 4, 8],
              'n_estimators': [100]}
clf = GridSearchCV(RandomForestClassifier(), parameters, cv = 5, verbose = 2, n_jobs= 4)
clf.fit(X, y)
clf.best_params_
# Refit a random forest with the tuned hyper-parameters, then
# cross-validate it on the training split.
rf = RandomForestClassifier(bootstrap=True,
                            max_depth=30,
                            max_features='sqrt',
                            min_samples_leaf=1,
                            n_estimators=100)
rf.fit(X_train, y_train)
score7 = cross_val_score(rf, X_train, y_train, cv=5).mean()
score7
# Collect every score gathered so far into one comparison table.
result = pd.DataFrame({
    'Technique': ['Logistic Regression', "LR", 'Decision tree',
                  'Random forest', 'DT CV', 'RF CV', 'Tuned RF CV'],
    'Score': [score1, score2, score3, score4, score5, score6, score7],
})
result
5.D. Use any other technique/method which can enhance the model performance.
# 5.D — rebuild the raw (pre-SMOTE) X/y, re-split, and project onto the
# first 10 principal components.
X = df.drop(labels='Pass/Fail', axis=1)
y = df['Pass/Fail']
X = X.add_prefix('f')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

from sklearn.decomposition import PCA

pca = PCA(10)  # keep the 10 leading components
#pca = PCA(.95)
pca.fit(X_train)  # fit on training data only to avoid leakage
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
X_train_pca.shape, X_test_pca.shape
pca.explained_variance_
# Logistic regression and decision tree on the PCA-reduced features.
lr = LogisticRegression()
lr.fit(X_train_pca, y_train)
score9 = lr.score(X_test_pca, y_test)
score9

dt = DecisionTreeClassifier()
dt.fit(X_train_pca, y_train)
score10 = dt.score(X_test_pca, y_test)
score10
# Tuned random forest on the PCA-reduced features.
# FIX: the original repeated this fit/score verbatim twice; once suffices.
# FIX: max_features='auto' (removed in scikit-learn 1.3) was an alias for
# 'sqrt' on classifiers, so 'sqrt' preserves the behaviour.
rf = RandomForestClassifier(bootstrap=True, max_depth=30, max_features='sqrt',
                            min_samples_leaf=1, n_estimators=100)
rf.fit(X_train_pca, y_train)
score11 = rf.score(X_test_pca, y_test)
score11
# Cross-validated scores of the three models on the PCA features,
# summarised next to their hold-out accuracies.
lr = LogisticRegression()
score12 = cross_val_score(lr, X_train_pca, y_train, cv=5).mean()
dt = DecisionTreeClassifier()
score13 = cross_val_score(dt, X_train_pca, y_train, cv=5).mean()
rf = RandomForestClassifier(bootstrap=True, max_depth=10, max_features='sqrt',
                            min_samples_leaf=1, n_estimators=100)
score14 = cross_val_score(rf, X_train_pca, y_train, cv=5).mean()
# FIX: 'Deision Tree' -> 'Decision Tree' (typo in the displayed table).
result = pd.DataFrame({'Algorithm': ['Logistic Regression', 'Decision Tree', 'Random Forest'],
                       'Accuracy_score': [score9, score10, score11],
                       'Cross_val_score': [score12, score13, score14]})
result
5.E. Display and explain the classification report in detail.
# 5.E — classification report for logistic regression on a fresh split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)
# Per-class precision / recall / F1 plus support counts.
print(classification_report(y_test, y_pred))
# Decision-tree classification report on the same split.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
# FIX: the original predicted with logistic_regression here, so the
# report re-described the previous model instead of the tree just fitted.
y_pred = dt.predict(X_test)
print(classification_report(y_test, y_pred))
# Random-forest classification report plus a confusion-matrix heatmap.
rf = RandomForestClassifier(bootstrap=True, max_depth=10, max_features='sqrt',
                            min_samples_leaf=1, n_estimators=100)
rf.fit(X_train, y_train)
# FIX: predict with the random forest just fitted, not the earlier
# logistic_regression model.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
5.F. Apply the above steps for all possible models that you have learnt so far
Bagging:
# Bagging ensemble built on the decision tree, with its confusion matrix.
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

# NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn
# 1.2 and removed in 1.4 — confirm the installed version before running.
bgcl = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
bgcl = bgcl.fit(X_train, y_train)
y_predict = bgcl.predict(X_test)
print(bgcl.score(X_test, y_test))
cm = confusion_matrix(y_test, y_predict)
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
AdaBoosting:
# AdaBoost ensemble with its confusion matrix.
from sklearn.ensemble import AdaBoostClassifier

ada_bcl = AdaBoostClassifier(n_estimators=10, random_state=1)
ada_bcl = ada_bcl.fit(X_train, y_train)
y_predict = ada_bcl.predict(X_test)
print(ada_bcl.score(X_test, y_test))
cm = confusion_matrix(y_test, y_predict)
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
GradientBoost:
# Gradient-boosting ensemble with its confusion matrix.
from sklearn.ensemble import GradientBoostingClassifier

gra_bcl = GradientBoostingClassifier(n_estimators=50, random_state=1)
gra_bcl = gra_bcl.fit(X_train, y_train)
y_predict = gra_bcl.predict(X_test)
print(gra_bcl.score(X_test, y_test))
cm = confusion_matrix(y_test, y_predict)
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g');
# Fit a batch of classifiers and print accuracy + full report for each.
tree = DecisionTreeClassifier(random_state=24)  # fixed seed for reproducibility
forest = RandomForestClassifier(random_state=24)
knn = KNeighborsClassifier()
# FIX: the original named this variable `svm`, shadowing the `sklearn.svm`
# module imported at the top of the file.
svc = SVC(random_state=24)
xboost = XGBClassifier(random_state=24)

models = [tree, forest, knn, svc, xboost]
for model in models:
    model.fit(X_train, y_train)  # fit on the current split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)  # overall hit rate
    clf_report = classification_report(y_test, y_pred)  # per-class detail
    print(f"The accuracy of model {type(model).__name__} is {accuracy:.2f}")
    print(clf_report)
    print("\n")
6.A. Display and compare all the models designed with their train and test accuracies
# 6.A — summary table of every model with its (hand-recorded) accuracy.
# FIX: the original 'Algorithms' column mixed fitted estimator objects
# with bare CLASSES (SVC, XGBClassifier), which renders as repr noise;
# use readable names instead.
report = pd.DataFrame({
    'Algorithms': ['Logistic Regression', 'Decision Tree', 'Random Forest',
                   'Bagging', 'AdaBoost', 'Gradient Boosting', 'KNN',
                   'SVC', 'XGBoost'],
    'Accuracy': [0.94, 0.94, 0.94, 0.92, 0.93, 0.93, 0.93, 0.94, 0.94],
})
report
6.B. Select the final best trained model along with your detailed comments for selecting this model.
6.C. Pickle the selected model for future use.
# 6.C — retrain the chosen model on the full dataset and persist it.
import pickle

regressor = LogisticRegression()
regressor.fit(X, y)
# FIX: the original passed an anonymous open(...) handle to pickle.dump
# and never closed it; a context manager guarantees the file is flushed
# and closed.
with open('model.pkl', 'wb') as fh:
    pickle.dump(regressor, fh)
6.D. Write your conclusion on the results